Load and Explore Data
strokes <- read.csv("./data/stroke_data.csv")
str(strokes)
'data.frame': 5110 obs. of 12 variables:
$ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
$ gender : chr "Male" "Female" "Male" "Female" ...
$ age : num 67 61 80 49 79 81 74 69 59 78 ...
$ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
$ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
$ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
$ work_type : chr "Private" "Self-employed" "Private" "Private" ...
$ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
$ avg_glucose_level: num 229 202 106 171 174 ...
$ bmi : chr "36.6" "N/A" "32.5" "34.4" ...
$ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
$ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
summary(strokes)
id gender age hypertension heart_disease
Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000 Min. :0.00000
1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000 1st Qu.:0.00000
Median :36932 Mode :character Median :45.00 Median :0.00000 Median :0.00000
Mean :36518 Mean :43.23 Mean :0.09746 Mean :0.05401
3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :72940 Max. :82.00 Max. :1.00000 Max. :1.00000
ever_married work_type Residence_type avg_glucose_level bmi
Length:5110 Length:5110 Length:5110 Min. : 55.12 Length:5110
Class :character Class :character Class :character 1st Qu.: 77.25 Class :character
Mode :character Mode :character Mode :character Median : 91.89 Mode :character
Mean :106.15
3rd Qu.:114.09
Max. :271.74
smoking_status stroke
Length:5110 Min. :0.00000
Class :character 1st Qu.:0.00000
Mode :character Median :0.00000
Mean :0.04873
3rd Qu.:0.00000
Max. :1.00000
Verify that the ID column is in fact unique
Remove the ID column, since it is an assigned identifier and has no bearing on stroke
uniqueID <- unique(strokes$id)
length(uniqueID)
[1] 5110
dim(strokes)
[1] 5110 12
strokes <- strokes[-1]
Since all of the positive stroke values appear at the start of the file, we randomize the data frame
set.seed(42)
rows <- sample(nrow(strokes))
strokes <- strokes[rows,]
Review category distribution for categorical variables
Update binary categorical variables to numeric
Remove rows with gender 'Other'; code Female as 0, Male as 1
strokes <- strokes %>% filter(gender != "Other")
strokes$gender <- ifelse(strokes$gender == "Female", 0,1)
Convert residence type to a binary response: 0 = Rural, 1 = Urban
strokes$Residence_type = ifelse(strokes$Residence_type == "Rural", 0, 1)
Convert ever_married to a binary response: 0 = No, 1 = Yes
strokes$ever_married <- ifelse(strokes$ever_married == 'No', 0,1)
Convert stroke to ‘yes/no’ values
strokes$stroke <- ifelse(strokes$stroke == 1, "Yes", "No")
Convert multi-category categorical features to factors
# strokes$gender <- as.factor(strokes$gender)
# strokes$hypertension <- as.factor(strokes$hypertension)
# strokes$heart_disease <- as.factor(strokes$heart_disease)
# strokes$ever_married <- as.factor(strokes$ever_married)
strokes$work_type <- as.factor(strokes$work_type)
# strokes$Residence_type <- as.factor(strokes$Residence_type)
strokes$smoking_status <- as.factor(strokes$smoking_status)
strokes$stroke <- as.factor(strokes$stroke)
strokes$smoking_status <- revalue(strokes$smoking_status, c("formerly smoked"="formerly_smoked", "never smoked"="never_smoked", "smokes"="smokes", "Unknown"="unknown"))
strokes$work_type <- revalue(strokes$work_type, c("Self-employed"="Self_employed"))
Clean bmi feature
- Convert “N/A” entries to NA values
- Convert from character to numeric values
strokes <- na_if(strokes,"N/A")
strokes$bmi <- as.numeric(strokes$bmi)
dim(strokes)
[1] 5109 11
Data Exploration
Missing Values
All features report 0 missing values except bmi, which carries the NA entries converted from "N/A" above.
require(tidyverse)
require(rcompanion)
# Calculate a pairwise association between all variables in a data-frame. In particular nominal vs nominal with Chi-square, numeric vs numeric with Pearson correlation, and nominal vs numeric with ANOVA.
# Adopted from https://stackoverflow.com/a/52557631/590437
mixed_assoc = function(df, cor_method="spearman", adjust_cramersv_bias=TRUE){
df_comb = expand.grid(names(df), names(df), stringsAsFactors = F) %>% set_names("X1", "X2")
is_nominal = function(x) class(x) %in% c("factor", "character")
# https://community.rstudio.com/t/why-is-purr-is-numeric-deprecated/3559
# https://github.com/r-lib/rlang/issues/781
is_numeric <- function(x) { is.integer(x) || is_double(x)}
f = function(xName,yName) {
x = pull(df, xName)
y = pull(df, yName)
result = if(is_nominal(x) && is_nominal(y)){
# use bias corrected cramersV as described in https://rdrr.io/cran/rcompanion/man/cramerV.html
cv = cramerV(as.character(x), as.character(y), bias.correct = adjust_cramersv_bias)
data.frame(xName, yName, assoc=cv, type="cramersV")
}else if(is_numeric(x) && is_numeric(y)){
correlation = cor(x, y, method=cor_method, use="complete.obs")
data.frame(xName, yName, assoc=correlation, type="correlation")
}else if(is_numeric(x) && is_nominal(y)){
# from https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable/124618#124618
r_squared = summary(lm(x ~ y))$r.squared
data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")
}else if(is_nominal(x) && is_numeric(y)){
r_squared = summary(lm(y ~x))$r.squared
data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")
}else {
warning(paste("unmatched column type combination: ", class(x), class(y)))
}
# finally add complete obs number and ratio to table
result %>% mutate(complete_obs_pairs=sum(!is.na(x) & !is.na(y)), complete_obs_ratio=complete_obs_pairs/length(x)) %>% rename(x=xName, y=yName)
}
# apply function to each variable combination
map2_df(df_comb$X1, df_comb$X2, f)
}
Associations between features
associations <- mixed_assoc(strokes)
associations
bmi imputation
bmi has multiple missing values. Instead of using the mean or median of the entire data set to impute them, determine whether a second feature has an association with bmi
association of bmi with other variables
bmi_assoc <- associations %>% filter(y == "bmi")
Plots of bmi vs other features
# ggplot(data=associations, aes(x=x, y=y, fill=assoc)) + geom_tile() + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1), axis.title = element_blank(), plot.title=element_text(hjust=0.5)) + ggtitle("Association Between Features")
ggplot(data=bmi_assoc, aes(x=x, y=y, fill=assoc)) + geom_tile() + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1, size=20), axis.text.y = element_text(size=20),axis.title = element_blank(), plot.title=element_text(hjust=0.5)) + ggtitle("Association with bmi")

plot(bmi ~ ., data=strokes)
Age has an association with bmi, so the data will be binned by age and the missing bmi values will be imputed using the median value of the bin they fall in.
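Create bins for the age feature and set missing bmi values to the median of the bin they fall in:
strokes_original <- strokes
strokes$age_cat <- bin(strokes$age, nbins=10, labels=c('a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8','a9', 'a10'), method = "clusters")
strokes <- strokes %>% group_by(age_cat) %>% mutate(bmi.new = median(bmi, na.rm=TRUE))
strokes$bmi <- ifelse(is.na(strokes$bmi), strokes$bmi.new, strokes$bmi)
# remove the working columns
strokes <- strokes[-c(12,13)]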
Verify missing values are resolved and data types are as expected
Missing Values
All features now report 0 missing values.
str(strokes)
tibble [5,109 × 11] (S3: tbl_df/tbl/data.frame)
$ gender : num [1:5109] 0 0 0 1 1 0 1 1 0 1 ...
$ age : num [1:5109] 56 16 28 52 4 42 65 31 66 64 ...
$ hypertension : int [1:5109] 0 0 0 0 0 0 0 0 1 0 ...
$ heart_disease : int [1:5109] 0 0 0 0 0 0 0 0 0 0 ...
$ ever_married : num [1:5109] 1 0 0 1 0 1 1 1 1 1 ...
$ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 3 4 4 1 2 4 2 4 4 ...
$ Residence_type : num [1:5109] 0 1 0 1 0 1 1 1 0 1 ...
$ avg_glucose_level: num [1:5109] 77.7 102.1 96.9 229.2 103.8 ...
$ bmi : num [1:5109] 40.8 27.1 29 35.6 15.9 29.8 28.4 30.4 39.5 28.3 ...
$ smoking_status : Factor w/ 4 levels "formerly_smoked",..: 2 2 4 1 4 4 3 1 2 4 ...
$ stroke : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
Explore relation of features with response variable ‘stroke’
p1 <- ggplot(data=strokes, aes(stroke, fill=as.factor(gender))) + geom_bar(position='fill') + labs(fill="gender")
p2 <- ggplot(data=strokes, aes(stroke, age)) + geom_boxplot()
p3 <- ggplot(data=strokes, aes(stroke, fill=as.factor(hypertension))) + geom_bar(position='fill') + labs(fill="hypertension")
p4 <- ggplot(data=strokes, aes(stroke, fill=as.factor(heart_disease))) + geom_bar(position='fill') + labs(fill='heart_disease')
p5 <- ggplot(data=strokes, aes(stroke, fill=as.factor(ever_married))) + geom_bar(position='fill') + labs(fill="ever_married")
p6 <- ggplot(data=strokes, aes(stroke, fill=work_type)) + geom_bar(position='fill')
p7 <- ggplot(data=strokes, aes(stroke, fill=as.factor(Residence_type))) + geom_bar(position='fill') + labs(fill="Residence_type")
p8 <- ggplot(data=strokes, aes(stroke, avg_glucose_level)) + geom_boxplot()
p9 <- ggplot(data=strokes, aes(stroke, bmi)) + geom_boxplot()
p10 <- ggplot(data=strokes, aes(stroke, fill=smoking_status)) + geom_bar(position='fill')
grid.arrange(p1,p2,p3,p4,nrow=2)

grid.arrange(p5,p6,p7,p8,nrow=2)

grid.arrange(p9,p10,nrow=1)

Chi-squared approximation may be incorrect
Chi-Square Tests

| Feature        | p-value   |
|----------------|-----------|
| smoking_status | 2.008e-06 |
| gender         | 0.5598    |
| hypertension   | 1.689e-19 |
| heart_disease  | 2.121e-21 |
| ever_married   | 1.686e-14 |
| work_type      | 5.409e-10 |
| residence_type | 0.2998    |

T-Tests

| Feature | p-value   |
|---------|-----------|
| age     | 2.176e-95 |
| glucose | 2.373e-11 |
| bmi     | 0.0003064 |
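The "Chi-squared approximation may be incorrect" warning above comes from small expected counts in the sparser cells (for example, the Never_worked level of work_type); a Monte-Carlo p-value avoids the approximation. A minimal check for one feature:
chisq.test(table(strokes$work_type, strokes$stroke), simulate.p.value = TRUE)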
The features gender and residence type do not show a statistically significant association with the stroke outcome. This can be observed visually as well as by the Chi-Square tests.
Review distribution of numeric features
hist(strokes$bmi, main="bmi", xlab="bmi")

hist(strokes$avg_glucose_level, main="average glucose level", xlab="avg_glucose_level")

hist(strokes$age, main="age", xlab="age")

Both avg_glucose_level and bmi are right-skewed. To help correct this, both variables will be replaced with their respective log values
strokes$bmi <- log(strokes$bmi)
strokes$avg_glucose_level <- log(strokes$avg_glucose_level)
hist(strokes$bmi)

hist(strokes$avg_glucose_level)

Since neither gender nor Residence_type shows a statistically significant association with the stroke outcome, both features will be removed to avoid unnecessary model complexity and noise
strokes <- strokes[-c(1,7)]
Prepare Data Sets for Model Usage
- Split into train, validation and test datasets
- Create balanced training data sets
- Separate into features and labels
- Scale Datasets
- One-Hot Encode Datasets
# create train/test dataset as well as subset of train for cross-validation and evaluation usage
set.seed(123)
selection <- createDataPartition(strokes$stroke, p=0.8, list=FALSE)
trainDataFull = strokes[selection,]
testData = strokes[-selection,]
set.seed(123)
selection2 <- createDataPartition(trainDataFull$stroke, p=0.8, list=FALSE)
trainData = trainDataFull[selection2,]
valData = trainDataFull[-selection2,]
#create balanced training datasets
trainDataBalanced <- ROSE(stroke ~ ., data=trainData, p=0.5)$data
trainDataFullBalanced <- ROSE(stroke ~ ., data=trainDataFull, p=0.5)$data
# separate features and labels
valDataFeatures <- valData[-9]
valDataLabels <- valData[9]
testDataFeatures <- testData[-9]
testDataLabels <- testData[9]
trainDataFeatures <- trainData[-9]
trainDataLabels <- trainData[9]
trainDataFullFeatures <- trainDataFull[-9]
trainDataFullLabels <- trainDataFull[9]
trainDataBalancedFeatures <- trainDataBalanced[-9]
trainDataBalancedLabels <- trainDataBalanced[9]
trainDataFullBalancedFeatures <- trainDataFullBalanced[-9]
trainDataFullBalancedLabels <- trainDataFullBalanced[9]
# scale features data sets. testData scales with trainDataFull while valData scales with trainData
preProcScaleFullTrain <- preProcess(trainDataFullBalancedFeatures, method=c("center", "scale"))
preProcScaleTrain <- preProcess(trainDataBalancedFeatures, method=c("center", "scale"))
trainDataFullBalancedFeatures_s <- predict(preProcScaleFullTrain, trainDataFullBalancedFeatures)
testDataFeatures_s <- predict(preProcScaleFullTrain, testDataFeatures)
trainDataBalancedFeatures_s <- predict(preProcScaleTrain, trainDataBalancedFeatures)
valDataFeatures_s <- predict(preProcScaleTrain, valDataFeatures)
# one-hot encode datasets
dummy1 <- dummyVars("~ .", data=trainDataFullBalancedFeatures_s)
trainDataFullBalancedFeatures_se <- data.frame(predict(dummy1,newdata=trainDataFullBalancedFeatures_s))
dummy2 <- dummyVars("~ .", data=trainDataBalancedFeatures)
trainDataBalancedFeatures_e <- data.frame(predict(dummy2,newdata=trainDataBalancedFeatures))
dummy3 <- dummyVars("~ .", data=trainDataFeatures)
trainDataFeatures_e <- data.frame(predict(dummy3,newdata=trainDataFeatures))
dummy4 <- dummyVars("~ .", data=trainDataFullFeatures)
trainDataFullFeatures_e <- data.frame(predict(dummy4,newdata=trainDataFullFeatures))
testDataFeatures_se <- data.frame(predict(dummy1,newdata=testDataFeatures_s))
valDataFeatures_e <- data.frame(predict(dummy2,newdata=valDataFeatures))
fullsize <- dim(trainDataFull)
trainsize <- dim(trainData)
valsize <- dim(valData)
testsize <- dim(testData)
distfull <- table(trainDataFull$stroke)
disttrain <- table(trainData$stroke)
distval <- table(valData$stroke)
disttest <- table(testData$stroke)
Verify dimensions and distribution of response variable for datasets
The proportion of “no” to “yes” values in all datasets is equivalent.
Fit Models
Define control for models trained with Caret
myControl <- trainControl(method='repeatedcv',
number=10,
repeats=5,
verboseIter = FALSE,
classProbs=TRUE,
summaryFunction=twoClassSummary,
selectionFunction="oneSE",
sampling='rose')
myControl2 <- trainControl(method='repeatedcv',
number=10,
repeats=5,
verboseIter = FALSE,
classProbs=TRUE,
summaryFunction=twoClassSummary,
selectionFunction="oneSE")
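With sampling='rose', caret re-balances the classes inside each resampling fold, which is why the unbalanced training data can be passed to train() directly; selectionFunction="oneSE" then selects the simplest candidate whose ROC is within one standard error of the best. myControl2 is identical except that it applies no re-sampling.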
KNN Model Training and Evaluation
Ensure that the square root of the number of observations (sqrt(3271) ≈ 57) falls within the range of k values explored
set.seed(123)
knn_grid <- expand.grid(k=seq(1,100,by=2))
knn_train <- cbind(trainDataFeatures_e, trainDataLabels)
knn_model <- caret::train(stroke ~ ., data=knn_train, method='knn', trControl=myControl, tuneGrid = knn_grid, preProc = 'range', metric='ROC')
plot(knn_model)

knn_pred <- predict(knn_model, newdata=valDataFeatures_e)
knn_prob <- predict(knn_model, newdata=valDataFeatures_e, type='prob')
knn_results <- confusionMatrix(knn_pred, valDataLabels$stroke, positive = "Yes")
knn_results2 <- data.frame(as.factor(knn_pred), as.factor(valDataLabels$stroke), knn_prob)
colnames(knn_results2) <- c("pred", "obs", "No", "Yes")
knn_model
k-Nearest Neighbors
3271 samples
15 predictor
2 classes: 'No', 'Yes'
Pre-processing: re-scaling to [0, 1] (15)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 2943, 2944, 2944, 2944, 2944, 2944, ...
Addtional sampling using ROSE prior to pre-processing
Resampling results across tuning parameters:
k ROC Sens Spec
1 0.6105385 0.8048271 0.41625
3 0.6951932 0.8315020 0.42375
5 0.7247873 0.8163251 0.46125
7 0.7548163 0.8279003 0.45875
9 0.7566825 0.8289298 0.47500
11 0.7614193 0.8243650 0.49000
13 0.7684376 0.8212126 0.49750
15 0.7703217 0.8217953 0.50625
17 0.7678556 0.8244956 0.47125
19 0.7788091 0.8132480 0.51250
21 0.7773042 0.8133047 0.50125
23 0.7771587 0.8135640 0.53500
25 0.7789210 0.8165871 0.51375
27 0.7874020 0.8105468 0.54000
29 0.7852502 0.8106664 0.52125
31 0.7866838 0.8080365 0.55000
33 0.7889458 0.8060413 0.54125
35 0.7820573 0.8046302 0.55500
37 0.7818359 0.8024411 0.54125
39 0.7847358 0.8050791 0.54125
41 0.7870788 0.7989082 0.58000
43 0.7808888 0.7982614 0.56250
45 0.7808209 0.7917104 0.56625
47 0.7874359 0.7952428 0.57250
49 0.7784092 0.7935706 0.56750
51 0.7859306 0.7906159 0.58875
53 0.7840340 0.7929279 0.58125
55 0.7859753 0.7889426 0.59125
57 0.7896676 0.7908739 0.59125
59 0.7863470 0.7875934 0.59125
61 0.7871621 0.7915774 0.58625
63 0.7889519 0.7896496 0.58625
65 0.7865711 0.7832791 0.60250
67 0.7867171 0.7789822 0.60750
69 0.7893115 0.7787804 0.61375
71 0.7886373 0.7805204 0.61500
73 0.7825669 0.7715238 0.61000
75 0.7852127 0.7722908 0.61750
77 0.7892454 0.7787225 0.61125
79 0.7843856 0.7742163 0.60875
81 0.7882281 0.7762794 0.61625
83 0.7847296 0.7742798 0.61125
85 0.7900390 0.7643229 0.63500
87 0.7892112 0.7690075 0.63750
89 0.7883087 0.7713262 0.60875
91 0.7825409 0.7645117 0.61250
93 0.7864030 0.7708818 0.61625
95 0.7885036 0.7717120 0.61500
97 0.7855760 0.7701090 0.61875
99 0.7874074 0.7652774 0.64750
ROC was used to select the optimal model using the one SE rule.
The final value used for the model was k = 99.
knn_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 604 15
Yes 173 25
Accuracy : 0.7699
95% CI : (0.7395, 0.7983)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 0.14
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.62500
Specificity : 0.77735
Pos Pred Value : 0.12626
Neg Pred Value : 0.97577
Prevalence : 0.04896
Detection Rate : 0.03060
Detection Prevalence : 0.24235
Balanced Accuracy : 0.70117
'Positive' Class : Yes
knn_2class <- twoClassSummary(knn_results2, lev = c("No", "Yes"))
knn_2class[1]
ROC
0.7888996
knn_roc <- roc(valDataLabels$stroke ~ knn_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases

Logistic Regression Model with Elastic Net Regularization
set.seed(123)
glm_grid <- expand.grid(alpha=seq(0,1, length=10), lambda = 10^seq(-3,3, length=100))
glm_train <- cbind(trainDataFeatures_e, trainDataLabels)
glm_model <- caret::train(stroke ~ ., data=glm_train, method='glmnet', trControl=myControl, tuneGrid = glm_grid, preProc = "range", metric='ROC', family='binomial')
glm_pred <- predict(glm_model, newdata = valDataFeatures_e)
glm_prob <- predict(glm_model, newdata = valDataFeatures_e, type='prob')
glm_results <- confusionMatrix(glm_pred, valDataLabels$stroke, positive="Yes")
glm_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 551 6
Yes 226 34
Accuracy : 0.716
95% CI : (0.6838, 0.7467)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 0.155
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.85000
Specificity : 0.70914
Pos Pred Value : 0.13077
Neg Pred Value : 0.98923
Prevalence : 0.04896
Detection Rate : 0.04162
Detection Prevalence : 0.31824
Balanced Accuracy : 0.77957
'Positive' Class : Yes
glm_results2 <- data.frame(as.factor(glm_pred), as.factor(valDataLabels$stroke), glm_prob)
colnames(glm_results2) <- c("pred", "obs", "No", "Yes")
glm_2class <- twoClassSummary(glm_results2, lev=c("No", "Yes"))
glm_2class[1]
ROC
0.8555341
glm_roc <- roc(valDataLabels$stroke ~ glm_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases

coef(glm_model$finalModel,glm_model$bestTune$lambda)
16 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -1.166219516
age 1.480281231
hypertension 0.333618442
heart_disease 0.001788369
ever_married 0.211458168
work_type.children -0.249526776
work_type.Govt_job .
work_type.Never_worked .
work_type.Private .
work_type.Self_employed .
avg_glucose_level 0.124124895
bmi .
smoking_status.formerly_smoked .
smoking_status.never_smoked .
smoking_status.smokes .
smoking_status.unknown .
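At the selected lambda, only age, hypertension, heart_disease, ever_married, work_type.children and avg_glucose_level retain non-zero coefficients, with age by far the largest; the elastic net shrinks bmi and the remaining dummies to exactly zero.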
varImp(glm_model)
glmnet variable importance
Decision Tree Model
set.seed(123)
tree_grid <- expand.grid(cp=seq(0.0,1.0,by=.001))
tree_data <- cbind(trainDataFeatures, trainDataLabels)
# tree_model <- caret::train(stroke ~.,data=tree_data, method='rpart', trControl=myControl, preProc = "range", metric='ROC', tuneGrid=tree_grid)
tree_model <- caret::train(stroke ~.,data=tree_data, method='rpart', trControl=myControl, metric='ROC', tuneGrid=tree_grid)
tree_pred <- predict(tree_model, valDataFeatures)
tree_prob <- predict(tree_model, newdata = valDataFeatures, type='prob')
tree_results <- confusionMatrix(tree_pred, valDataLabels$stroke, positive="Yes")
tree_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 0 0
Yes 777 40
Accuracy : 0.049
95% CI : (0.0352, 0.0661)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.00000
Specificity : 0.00000
Pos Pred Value : 0.04896
Neg Pred Value : NaN
Prevalence : 0.04896
Detection Rate : 0.04896
Detection Prevalence : 1.00000
Balanced Accuracy : 0.50000
'Positive' Class : Yes
tree_results2 <- data.frame(as.factor(tree_pred), as.factor(valDataLabels$stroke), tree_prob)
colnames(tree_results2) <- c("pred", "obs", "No", "Yes")
tree_2class <- twoClassSummary(tree_results2, lev=c("No", "Yes"))
tree_2class[1]
ROC
0.5
tree_roc <- roc(valDataLabels$stroke ~ tree_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases

fancyRpartPlot(tree_model$finalModel)

Considering the poor performance of the decision tree, a second model will be built without balancing the training data (using myControl2, which omits the ROSE sampling)
set.seed(123)
tree_grid <- expand.grid(cp=seq(0.0,1.0,by=.001))
tree_data <- cbind(trainDataFeatures, trainDataLabels)
tree_model <- caret::train(stroke ~.,data=tree_data, method='rpart', trControl=myControl2, preProc = "range", metric='ROC', tuneGrid=tree_grid)
tree_pred <- predict(tree_model, valDataFeatures)
tree_prob <- predict(tree_model, newdata = valDataFeatures, type='prob')
tree_results <- confusionMatrix(tree_pred, valDataLabels$stroke, positive="Yes")
tree_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 772 36
Yes 5 4
Accuracy : 0.9498
95% CI : (0.9325, 0.9638)
No Information Rate : 0.951
P-Value [Acc > NIR] : 0.6049
Kappa : 0.1479
Mcnemar's Test P-Value : 2.797e-06
Sensitivity : 0.100000
Specificity : 0.993565
Pos Pred Value : 0.444444
Neg Pred Value : 0.955446
Prevalence : 0.048960
Detection Rate : 0.004896
Detection Prevalence : 0.011016
Balanced Accuracy : 0.546782
'Positive' Class : Yes
tree_results2 <- data.frame(as.factor(tree_pred), as.factor(valDataLabels$stroke), tree_prob)
colnames(tree_results2) <- c("pred", "obs", "No", "Yes")
tree_2class <- twoClassSummary(tree_results2, lev=c("No", "Yes"))
tree_2class[1]
ROC
0.706435
tree_roc <- roc(valDataLabels$stroke ~ tree_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases

fancyRpartPlot(tree_model$finalModel)

Random Forest Model
rf_grid <- expand.grid(mtry=c(1,2,4,6,8))
set.seed(123)
rf_data <- cbind(trainDataFeatures, trainDataLabels)
rf_model <- caret::train(stroke ~.,data=rf_data, method='rf', trControl=myControl, preProc = "range", metric='ROC', tuneGrid=rf_grid)
rf_pred <- predict(rf_model, valDataFeatures)
rf_prob <- predict(rf_model, valDataFeatures, type='prob')
rf_results <- confusionMatrix(rf_pred,valDataLabels$stroke, positive="Yes")
rf_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 416 2
Yes 361 38
Accuracy : 0.5557
95% CI : (0.5209, 0.5901)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 0.0923
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.95000
Specificity : 0.53539
Pos Pred Value : 0.09524
Neg Pred Value : 0.99522
Prevalence : 0.04896
Detection Rate : 0.04651
Detection Prevalence : 0.48837
Balanced Accuracy : 0.74270
'Positive' Class : Yes
rf_results2 <- data.frame(as.factor(rf_pred), as.factor(valDataLabels$stroke), rf_prob)
colnames(rf_results2) <- c("pred", "obs", "No", "Yes")
rf_2class <- twoClassSummary(rf_results2, lev=c("No", "Yes"))
rf_2class[1]
ROC
0.8387227
rf_roc <- roc(valDataLabels$stroke ~ rf_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve")
Setting levels: control = No, case = Yes
Setting direction: controls < cases

varImp(rf_model, scale = TRUE)
rf variable importance
rf_model
Random Forest
3271 samples
8 predictor
2 classes: 'No', 'Yes'
Pre-processing: re-scaling to [0, 1] (13)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 2943, 2944, 2944, 2944, 2944, 2944, ...
Addtional sampling using ROSE prior to pre-processing
Resampling results across tuning parameters:
mtry ROC Sens Spec
1 0.8147439 0.539185011 0.92125
2 0.8100990 0.210691112 0.99125
4 0.7847946 0.004501195 1.00000
6 0.7651958 0.004501195 1.00000
8 0.7474645 0.004501195 1.00000
ROC was used to select the optimal model using the one SE rule.
The final value used for the model was mtry = 1.
plot(rf_model)

Linear SVM Model
set.seed(123)
lsvm_grid <- expand.grid(C = 3**(-7:7))
lsvm_data <- cbind(trainDataFeatures_e, trainDataLabels)
lsvm_model <- caret::train(stroke ~., data = lsvm_data, method = "svmLinear", trControl = myControl, preProcess = 'range', tuneGrid = lsvm_grid, metric= 'ROC')
lsvm_pred <- predict(lsvm_model, valDataFeatures_e)
lsvm_prob <- predict(lsvm_model, valDataFeatures_e, type='prob')
lsvm_results <- confusionMatrix(lsvm_pred, valDataLabels$stroke, positive="Yes")
lsvm_results2 <- data.frame(as.factor(lsvm_pred), as.factor(valDataLabels$stroke), lsvm_prob)
lsvm_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 584 7
Yes 193 33
Accuracy : 0.7552
95% CI : (0.7242, 0.7843)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 0.1799
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.82500
Specificity : 0.75161
Pos Pred Value : 0.14602
Neg Pred Value : 0.98816
Prevalence : 0.04896
Detection Rate : 0.04039
Detection Prevalence : 0.27662
Balanced Accuracy : 0.78830
'Positive' Class : Yes
colnames(lsvm_results2) <- c("pred", "obs", "No", "Yes")
lsvm_2class <- twoClassSummary(lsvm_results2, lev=c("No", "Yes"))
lsvm_2class[1]
ROC
0.8487773
varImp(lsvm_model, scale = TRUE)
ROC curve variable importance
lsvm_roc <- roc(valDataLabels$stroke ~ lsvm_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases

plot(lsvm_model, scales=list(x=list(log=3)))

lsvm_model
Support Vector Machines with Linear Kernel
3271 samples
15 predictor
2 classes: 'No', 'Yes'
Pre-processing: re-scaling to [0, 1] (15)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 2943, 2944, 2944, 2944, 2944, 2944, ...
Addtional sampling using ROSE prior to pre-processing
Resampling results across tuning parameters:
C ROC Sens Spec
4.572474e-04 0.8223611 0.7261314 0.77000
1.371742e-03 0.8305492 0.7266456 0.79875
4.115226e-03 0.8371825 0.7276736 0.81625
1.234568e-02 0.8363893 0.7294760 0.80875
3.703704e-02 0.8366878 0.7251682 0.81625
1.111111e-01 0.8377090 0.7283768 0.80875
3.333333e-01 0.8391237 0.7261312 0.81250
1.000000e+00 0.8384358 0.7249744 0.81375
3.000000e+00 0.8372659 0.7269680 0.81125
9.000000e+00 0.8369120 0.7234337 0.80875
2.700000e+01 0.8387552 0.7233682 0.81625
8.100000e+01 0.8389665 0.7263268 0.81625
2.430000e+02 0.8384265 0.7249740 0.81625
7.290000e+02 0.8341659 0.7413742 0.78125
2.187000e+03 0.8366235 0.7315947 0.81000
ROC was used to select the optimal model using the one SE rule.
The final value used for the model was C = 0.004115226.
Gradient Boost Model
gbm_Grid <- expand.grid(interaction.depth = c(1, 3, 6, 9, 10),
n.trees = seq(250,2000,250),
shrinkage = seq(.0005, .05,.0005),
n.minobsinnode = 10)
set.seed(123)
gbm_data = cbind(trainDataFeatures_e, trainDataLabels)
gbm_model <- caret::train(stroke ~., data = gbm_data, method = "gbm", trControl = myControl, preProcess = 'range', metric= 'ROC', tuneLength = 10)
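Note that gbm_Grid is defined above but never passed to train() (there is no tuneGrid argument); tuneLength = 10 makes caret build its own default grid instead, which is why the output below sweeps n.trees from 50 to 500 and holds shrinkage constant at 0.1 rather than using the values in gbm_Grid.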
gbm_pred <- predict(gbm_model, valDataFeatures_e)
gbm_prob <- predict(gbm_model, valDataFeatures_e, type='prob')
gbm_results <- confusionMatrix(gbm_pred, valDataLabels$stroke, positive='Yes')
gbm_results
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 3 0
Yes 774 40
Accuracy : 0.0526
95% CI : (0.0383, 0.0702)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 4e-04
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.000000
Specificity : 0.003861
Pos Pred Value : 0.049140
Neg Pred Value : 1.000000
Prevalence : 0.048960
Detection Rate : 0.048960
Detection Prevalence : 0.996328
Balanced Accuracy : 0.501931
'Positive' Class : Yes
gbm_results2 <- data.frame(as.factor(gbm_pred), as.factor(valDataLabels$stroke), gbm_prob)
colnames(gbm_results2) <- c("pred", "obs", "No", "Yes")
gbm_2class <- twoClassSummary(gbm_results2, lev=c("No", "Yes"))
gbm_2class[1]
ROC
0.6796493
gbm_roc <- roc(valDataLabels$stroke ~ gbm_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases

gbm_model
Stochastic Gradient Boosting
3271 samples
15 predictor
2 classes: 'No', 'Yes'
Pre-processing: re-scaling to [0, 1] (15)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 2943, 2944, 2944, 2944, 2944, 2944, ...
Addtional sampling using ROSE prior to pre-processing
Resampling results across tuning parameters:
interaction.depth n.trees ROC Sens Spec
1 50 0.5120476 0.01311856 0.99625
1 100 0.5120396 0.01614107 0.99500
1 150 0.5284698 0.01311856 0.99625
1 200 0.5718248 0.01909927 0.99375
1 250 0.5915129 0.01909927 0.99375
1 300 0.6157874 0.01607676 0.99500
1 350 0.6248188 0.01607676 0.99500
1 400 0.6370812 0.01607676 0.99500
1 450 0.6420189 0.01909927 0.99375
1 500 0.6442202 0.01607676 0.99500
2 50 0.5164203 0.01909927 0.99375
2 100 0.5623875 0.01909927 0.99375
2 150 0.5850932 0.01909927 0.99375
2 200 0.5986393 0.01909927 0.99375
2 250 0.6160540 0.01916357 0.99375
2 300 0.6317473 0.01922788 0.99375
2 350 0.6487803 0.01646261 0.99500
2 400 0.6537964 0.01652692 0.99500
2 450 0.6606227 0.01652692 0.99500
2 500 0.6647613 0.01652692 0.99500
3 50 0.5114126 0.01909927 0.99375
3 100 0.5378068 0.01909927 0.99375
3 150 0.5664027 0.01909927 0.99375
3 200 0.5855627 0.01909927 0.99375
3 250 0.5919699 0.01916357 0.99375
3 300 0.6096614 0.01916357 0.99375
3 350 0.6172812 0.01916357 0.99375
3 400 0.6210599 0.01929219 0.99375
3 450 0.6283093 0.01929219 0.99375
3 500 0.6380882 0.01929219 0.99375
4 50 0.5192763 0.01909927 0.99375
4 100 0.5415156 0.01909927 0.99375
4 150 0.5633766 0.01909927 0.99375
4 200 0.5706053 0.01909927 0.99375
4 250 0.5790300 0.01909927 0.99375
4 300 0.5897672 0.01909927 0.99375
4 350 0.6253183 0.01922788 0.99375
4 400 0.6463812 0.01922788 0.99375
4 450 0.6583945 0.01922788 0.99375
4 500 0.6624801 0.01922788 0.99375
5 50 0.4982885 0.01909927 0.99375
5 100 0.5213421 0.01909927 0.99375
5 150 0.5531265 0.01909927 0.99375
5 200 0.5872584 0.01909927 0.99375
5 250 0.6011820 0.01909927 0.99375
5 300 0.6084796 0.01967660 0.99375
5 350 0.6348726 0.01993301 0.99375
5 400 0.6606058 0.02006122 0.99375
5 450 0.6667146 0.02044666 0.99375
5 500 0.6759749 0.02057527 0.99375
6 50 0.5229865 0.01909927 0.99375
6 100 0.5431204 0.01909927 0.99375
6 150 0.5414926 0.01909927 0.99375
6 200 0.5618280 0.01909927 0.99375
6 250 0.5709914 0.01909927 0.99375
6 300 0.5918941 0.01916357 0.99375
6 350 0.6107437 0.01916357 0.99375
6 400 0.6242674 0.01942081 0.99375
6 450 0.6414063 0.02173592 0.99375
6 500 0.6391775 0.02135007 0.99375
7 50 0.4739808 0.01909927 0.99375
7 100 0.4692592 0.01909927 0.99375
7 150 0.4962372 0.01909927 0.99375
7 200 0.5336867 0.01909927 0.99375
7 250 0.5601293 0.02135007 0.99375
7 300 0.5752217 0.02154300 0.99375
7 350 0.5990514 0.02212177 0.99375
7 400 0.6127874 0.02270055 0.99375
7 450 0.6220114 0.02334364 0.99375
7 500 0.6380094 0.02372949 0.99375
8 50 0.4832537 0.01909927 0.99375
8 100 0.5032918 0.01909927 0.99375
8 150 0.5346193 0.01909927 0.99375
8 200 0.5394732 0.01916357 0.99375
8 250 0.5726730 0.01935650 0.99375
8 300 0.5902557 0.01980666 0.99375
8 350 0.6110737 0.01993528 0.99375
8 400 0.6203771 0.01999959 0.99375
8 450 0.6540952 0.02012821 0.99375
8 500 0.6631274 0.02019251 0.99375
9 50 0.5291433 0.01909927 0.99375
9 100 0.5304313 0.01909927 0.99375
9 150 0.5330051 0.01909927 0.99375
9 200 0.5486256 0.01909927 0.99375
9 250 0.5942750 0.02295779 0.99375
9 300 0.6119054 0.02398673 0.99375
9 350 0.6214457 0.02572079 0.99375
9 400 0.6257709 0.02655578 0.99375
9 450 0.6386572 0.02668419 0.99375
9 500 0.6522057 0.02674829 0.99375
10 50 0.5091538 0.01909927 0.99375
10 100 0.5384197 0.01909927 0.99375
10 150 0.5509515 0.01909927 0.99375
10 200 0.5710913 0.01909927 0.99375
10 250 0.5958467 0.01909927 0.99375
10 300 0.6124699 0.02077047 0.99375
10 350 0.6356838 0.02250495 0.99375
10 400 0.6526590 0.02282649 0.99375
10 450 0.6743980 0.02417615 0.99375
10 500 0.6833120 0.02417615 0.99375
Tuning parameter 'shrinkage' was held constant at a value of 0.1
Tuning parameter 'n.minobsinnode' was held constant at a value of 10
ROC was used to select the optimal model using the one SE rule.
The final values used for the model were n.trees = 450, interaction.depth = 10, shrinkage = 0.1 and n.minobsinnode = 10.
plot(gbm_model)

Neural Network Model
Scale datasets for usage
numerical <- c(1,6,7)
trainDataFullNum <- trainDataFull[numerical]
trainDataFullCat <- trainDataFull[-numerical]
trainDataNum <- trainData[numerical]
trainDataCat <- trainData[-numerical]
valDataNum <- valData[numerical]
valDataCat <- valData[-numerical]
testDataNum <- testData[numerical]
testDataCat <- testData[-numerical]
trainDataFullScaled <- scale(trainDataFullNum)
col_means_train <- attr(trainDataFullScaled, "scaled:center")
col_stddevs_train <- attr(trainDataFullScaled, "scaled:scale")
testDataScaled <- scale(testDataNum, center = col_means_train, scale = col_stddevs_train)
trainDataScaled <- scale(trainDataNum)
col_means_train <- attr(trainDataScaled, "scaled:center")
col_stddevs_train <- attr(trainDataScaled, "scaled:scale")
valDataScaled <- scale(valDataNum, center = col_means_train, scale = col_stddevs_train)
nnFullTrainData <- cbind(trainDataFullScaled, trainDataFullCat)
nnTrainData <- cbind(trainDataScaled, trainDataCat)
nnTestData <- cbind(testDataScaled, testDataCat)
nnValData <- cbind(valDataScaled, valDataCat)
# balance training data
nnTrainData <- ROSE(stroke ~ ., data=nnTrainData, p=0.5)$data
nnFullTrainData <- ROSE(stroke ~ ., data=nnFullTrainData, p=0.5)$data
nnValBal <- ROSE(stroke ~ ., data=nnValData, p=0.5)$data
separate features from labels
nnFullTrainFeatures <- nnFullTrainData[-9]
nnFullTrainLabels <- nnFullTrainData[9]
nnTrainFeatures <- nnTrainData[-9]
nnTrainLabels <- nnTrainData[9]
nnTestFeatures <- nnTestData[-9]
nnTestLabels <- nnTestData[9]
nnValFeatures <- nnValData[-9]
nnValLabels <- nnValData[9]
nnValBalFeatures <- nnValBal[-9]
nnValBalLabels <- nnValBal[9]
nnValLabels$stroke <- ifelse(nnValLabels$stroke == 'No', 0, 1)
nnTestLabels$stroke <- ifelse(nnTestLabels$stroke == 'No', 0, 1)
nnTrainLabels$stroke <- ifelse(nnTrainLabels$stroke == 'No', 0, 1)
nnFullTrainLabels$stroke <- ifelse(nnFullTrainLabels$stroke == 'No', 0, 1)
nnValBalLabels$stroke <- ifelse(nnValBalLabels$stroke == 'No', 0, 1)
### Stroke Distribution for Train Dataset

| 0 | 1 |
|------|------|
| 1642 | 1629 |

### Stroke Distribution for fullTrain Dataset

| 0 | 1 |
|------|------|
| 2021 | 2067 |
one-hot encode categorical features
dummyFullTrain <- dummyVars("~ .", data=nnFullTrainFeatures)
dummyTrain <- dummyVars("~ .", data=nnTrainFeatures)
nnFullTrainFeatures <- data.frame(predict(dummyFullTrain,newdata=nnFullTrainFeatures))
nnTestFeatures <- data.frame(predict(dummyFullTrain,newdata=nnTestFeatures))
nnTrainFeatures <- data.frame(predict(dummyTrain,newdata=nnTrainFeatures))
nnValFeatures <- data.frame(predict(dummyTrain,newdata=nnValFeatures))
nnValBalFeatures <- data.frame(predict(dummyTrain,newdata=nnValBalFeatures))
Neural Network Model Tuning
clean_runs(confirm=FALSE)
set.seed(123)
runs <- tuning_run("ann.R", flags = list(nodes1 = c(20, 50,75), nodes2=c(20,50,75), drop1=c(0.2,0.4,0.6), drop2=c(0.2,0.4,0.6), learning_rate = c(0.01,0.05,0.001,0.0001), epochs = c(25,50,100), batch_size = c(10,25,50), activation = c("relu", "sigmoid", "tanh")),sample = 0.02, confirm=FALSE, echo = FALSE)
rundf <- ls_runs(order='metric_val_accuracy', decreasing=TRUE)
rundf
Data frame: 175 x 28
# ... with 165 more rows
# ... with 23 more columns:
# flag_nodes1, flag_nodes2, flag_batch_size, flag_activation, flag_learning_rate,
# flag_epochs, flag_drop1, flag_drop2, epochs, epochs_completed, metrics, model,
# loss_function, optimizer, learning_rate, script, start, end, completed, output,
# source_code, context, type
bestrun <- rundf[1,1:19]
Best Run Details
| Field | Value |
|---|---|
| run_dir | runs/2021-04-28T05-06-54Z |
| metric_val_accuracy | 0.951 |
| metric_loss | 0.6952 |
| metric_accuracy | 0.5032 |
| metric_val_loss | 0.6749 |
| flag_nodes1 | 50 |
| flag_nodes2 | 50 |
| flag_batch_size | 10 |
| flag_activation | relu |
| flag_learning_rate | 0.05 |
| flag_epochs | 50 |
| flag_drop1 | 0.4 |
| flag_drop2 | 0.6 |
| epochs | 50 |
| epochs_completed | 50 |
| metrics | runs/2021-04-28T05-06-54Z/tfruns.d/metrics.json |
| loss_function | binary_crossentropy |
| optimizer | <tensorflow.python.keras.optimizer_v2.adam.Adam> |

Model: "sequential"

| Layer (type) | Output Shape | Param # |
|---|---|---|
| dense_2 (Dense) | (None, 50) | 800 |
| dropout_1 (Dropout) | (None, 50) | 0 |
| dense_1 (Dense) | (None, 50) | 2550 |
| dropout (Dropout) | (None, 50) | 0 |
| dense (Dense) | (None, 1) | 51 |

Total params: 3,401 (trainable: 3,401, non-trainable: 0)
Create model using parameters from the best tuning run
model <- keras_model_sequential()
model %>%
layer_dense(units = 50, activation = "relu", input_shape = dim(nnTrainFeatures)[2]) %>%
layer_dropout(rate=0.4) %>%
layer_dense(units = 50, activation = "relu") %>%
layer_dropout(rate=0.6) %>%
layer_dense(units = 1, activation = 'sigmoid')
model %>% compile(
optimizer = optimizer_adam(lr=0.05),
loss='binary_crossentropy',
metrics = c('accuracy')
)
set.seed(234)
history <- model %>% fit(as.matrix(nnTrainFeatures), as.matrix(nnTrainLabels), epochs=50, batch_size=10,validation_data=list(as.matrix(nnValFeatures), as.matrix(nnValLabels)))
Evaluate NN Model Performance
set.seed(123)
model %>% evaluate(as.matrix(nnValFeatures), as.matrix(nnValLabels))
26/26 [==============================] - 0s - loss: 0.4428 - accuracy: 0.4908
     loss  accuracy
0.4427884 0.4908201
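predict_classes() and predict_proba(), used below, were removed from the keras R interface in later releases; with newer keras the equivalent (a sketch, assuming the sigmoid output above) is:
# predict() returns the sigmoid probability of class 1 ("Yes")
nn_raw  <- model %>% predict(as.matrix(nnValFeatures))
nn_pred <- ifelse(nn_raw > 0.5, 1, 0)   # threshold at 0.5 to recover class labels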
nn_pred <- model %>% predict_classes(as.matrix(nnValFeatures))
nn_prob <- model %>% predict_proba(as.matrix(nnValFeatures))
nn_prob <- data.frame(nn_prob)
colnames(nn_prob) <- c('Yes')
nn_prob$No <- 1 - nn_prob$Yes
nn_results2 <- data.frame(nn_pred, nnValLabels, nn_prob)
colnames(nn_results2) <- c("pred", "obs", "Yes", "No")
nn_results2$pred <-as.factor(ifelse(nn_results2$pred == 0, 'No', 'Yes'))
nn_results2$obs <-as.factor(ifelse(nn_results2$obs == 0, 'No', 'Yes'))
nn_2class <- twoClassSummary(nn_results2, lev=c("No", "Yes"))
confusionMatrix(nn_results2$pred,nn_results2$obs, positive='Yes')
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 361 0
Yes 416 40
Accuracy : 0.4908
95% CI : (0.456, 0.5257)
No Information Rate : 0.951
P-Value [Acc > NIR] : 1
Kappa : 0.0783
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.00000
Specificity : 0.46461
Pos Pred Value : 0.08772
Neg Pred Value : 1.00000
Prevalence : 0.04896
Detection Rate : 0.04896
Detection Prevalence : 0.55814
Balanced Accuracy : 0.73230
'Positive' Class : Yes
nn_2class[1]
ROC
0.7342342
plot(history)

nn_roc <- roc(nnValLabels$stroke ~ nn_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases

Model Comparison
Model Performance Comparison

| Model          | ROC    | Specificity | Sensitivity |
|----------------|--------|-------------|-------------|
| knn            | 0.7889 | 0.7773      | 0.625       |
| glmnet         | 0.8555 | 0.7091      | 0.85        |
| rpart          | 0.7064 | 0.9936      | 0.1         |
| rf             | 0.8387 | 0.5354      | 0.95        |
| svmLinear      | 0.8488 | 0.7516      | 0.825       |
| gbm            | 0.6796 | 0.003861    | 1           |
| neural network | 0.7342 | 0.4646      | 1           |
pred_rf= prediction(rf_prob$Yes,valDataLabels$stroke)
performance(pred_rf, measure = "auc")@y.values
rf_roc <- roc(valDataLabels$stroke ~ rf_prob$Yes, plot=TRUE, print.auc=FALSE, col='green', lwd=4, legacy.axes=TRUE, main="ROC Curve")
pred_knn = prediction(knn_prob$Yes, valDataLabels$stroke)
performance(pred_knn, measure='auc')@y.values
knn_roc <- roc(valDataLabels$stroke ~ knn_prob$Yes, plot=TRUE, print.auc=FALSE, col='blue', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)
pred_glm <- prediction(glm_prob$Yes, valDataLabels$stroke)
glm_perform <- performance(pred_glm, measure='auc')@y.values
glm_roc <- roc(valDataLabels$stroke ~ glm_prob$Yes, plot=TRUE, print.auc=FALSE, col='orange', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)
pred_tree <- prediction(tree_prob$Yes, valDataLabels$stroke)
performance(pred_tree, measure='auc')@y.values
tree_roc <- roc(valDataLabels$stroke ~ tree_prob$Yes, plot=TRUE, print.auc=FALSE, col='yellow', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)
pred_lsvm <- prediction(lsvm_prob$Yes, valDataLabels$stroke)
performance(pred_lsvm, measure='auc')@y.values
lsvm_roc <- roc(valDataLabels$stroke ~ lsvm_prob$Yes, plot=TRUE, print.auc=FALSE, col='purple', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)
pred_gbm <- prediction(gbm_prob$Yes, valDataLabels$stroke)
performance(pred_gbm, measure='auc')@y.values
gbm_roc <- roc(valDataLabels$stroke ~ gbm_prob$Yes, plot=TRUE, print.auc=FALSE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)
pred_nn = prediction(nn_prob$Yes,nnValLabels$stroke)
performance(pred_nn, measure='auc')@y.values
nn_roc <- roc(nnValLabels$stroke ~ nn_prob$Yes, plot=TRUE, print.auc=FALSE, col='brown', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)
legend("bottomright", legend=c("knn", 'rf', 'glmnet', 'rpart', 'svmLinear', 'gbm', 'neural network'), col=c("blue", "green", 'orange', 'yellow', 'purple', 'black', 'brown'), lwd=2)

---
title: "Stroke Prevention"
subtitle: "CSC532B Spring 2021"
author: "Thomas Champion"
output: html_notebook
---

```{r include=FALSE}
require(plyr)
require(dplyr)
require(pander)
require(ggplot2)
require(gridExtra)
require(caret)
require(DMwR)
require(ROSE)
require(keras)
use_session_with_seed(42)
require(tensorflow)
require(tfruns)
require(ROCR)
require(pROC)
require(OneR)
require(glmnet)
require(rattle)
require(rpart)
require(rpart.plot)
require(gbm)
require(xgboost)
```

# Load and Explore Data
```{r}
strokes <- read.csv("./data/stroke_data.csv")
str(strokes)
summary(strokes)
```




### Verify that the ID column is in fact unique
### Remove the ID column, since it is an assigned identifier and has no bearing on stroke

```{r}
uniqueID <- unique(strokes$id)
length(uniqueID)
dim(strokes)
strokes <- strokes[-1]
```

### Since all of the positive stroke values appear at the start of the file, we randomize the data frame

```{r}
set.seed(42)
rows <- sample(nrow(strokes))
strokes <- strokes[rows,]
```
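An equivalent shuffle with dplyr (a sketch; requires dplyr >= 1.0 and is not evaluated, since the chunk above already shuffles the rows):

```{r eval=FALSE}
# slice_sample with prop = 1 returns every row in random order
strokes <- strokes %>% slice_sample(prop = 1)
```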

### Review category distribution for categorical variables
```{r echo=FALSE, results='asis'}
pandoc.header("Gender Entries",5)
pandoc.table(table(strokes$gender))

pandoc.header("Ever Married Entries",5)
pandoc.table(table(strokes$ever_married))

pandoc.header("Work Type Entries",5)
pandoc.table(table(strokes$work_type))

pandoc.header("Residence TypeEntries",5)
pandoc.table(table(strokes$Residence_type))

pandoc.header("Smoking Status Entries",5)
pandoc.table(table(strokes$smoking_status))

pandoc.header("HypertensionEntries",5)
pandoc.table(table(strokes$hypertension))

pandoc.header("Heart Disease Entries",5)
pandoc.table(table(strokes$heart_disease))



pandoc.header("Smoking Status Entries",5)
pandoc.table(table(strokes$smoking_status))

pandoc.header("Stroke Entries",5)
pandoc.table(table(strokes$stroke))

```
### Update binary categorical variables to numeric
### Remove rows with gender 'Other'; code Female as 0, Male as 1
```{r}
strokes <- strokes %>% filter(gender != "Other")
strokes$gender <- ifelse(strokes$gender == "Female", 0,1)
```

### Convert residence type to a binary response: 0 = Rural, 1 = Urban
```{r}
strokes$Residence_type = ifelse(strokes$Residence_type == "Rural", 0, 1)
```

### Convert ever_married to a binary response: 0 = No, 1 = Yes
```{r}
strokes$ever_married <- ifelse(strokes$ever_married == 'No', 0,1)
```


### Convert stroke to 'yes/no' values

```{r}
strokes$stroke <- ifelse(strokes$stroke == 1, "Yes", "No")
```


### Convert multi-category categorical features to factors
```{r}
# strokes$gender <- as.factor(strokes$gender)
# strokes$hypertension <- as.factor(strokes$hypertension)
# strokes$heart_disease <- as.factor(strokes$heart_disease)
# strokes$ever_married <- as.factor(strokes$ever_married)
strokes$work_type <- as.factor(strokes$work_type)
# strokes$Residence_type <- as.factor(strokes$Residence_type)
strokes$smoking_status <- as.factor(strokes$smoking_status)
strokes$stroke <- as.factor(strokes$stroke)
strokes$smoking_status <- revalue(strokes$smoking_status, c("formerly smoked"="formerly_smoked", "never smoked"="never_smoked", "smokes"="smokes", "Unknown"="unknown"))
strokes$work_type <- revalue(strokes$work_type, c("Self-employed"="Self_employed"))

```
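revalue() comes from plyr; the same renaming expressed with forcats (a sketch, not evaluated since the chunk above already renames the levels):

```{r eval=FALSE}
# fct_recode takes new_level = "old_level" pairs
strokes$smoking_status <- forcats::fct_recode(strokes$smoking_status,
  formerly_smoked = "formerly smoked",
  never_smoked    = "never smoked",
  unknown         = "Unknown")
strokes$work_type <- forcats::fct_recode(strokes$work_type,
  Self_employed = "Self-employed")
```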


### Clean bmi feature   
* Convert "N/A" entries to NA values
* Convert from character to numeric values
```{r}
strokes <- na_if(strokes,"N/A")
strokes$bmi <- as.numeric(strokes$bmi)
dim(strokes)

```
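Calling na_if() on a whole data frame relies on behaviour that newer dplyr releases no longer support; a column-wise equivalent (a sketch, not evaluated):

```{r eval=FALSE}
# replace the literal string "N/A" with NA in every character column,
# then coerce bmi to numeric
strokes <- strokes %>%
  mutate(across(where(is.character), ~ na_if(.x, "N/A")),
         bmi = as.numeric(bmi))
```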
## Data Exploration
```{r echo=FALSE,results='asis'}
missing <- sapply(strokes, function(x) sum(is.na(x)))
pandoc.header("Missing Values", 4)
pandoc.table(missing)
```
```{r}
require(tidyverse)
require(rcompanion)


# Calculate a pairwise association between all variables in a data-frame. In particular nominal vs nominal with Chi-square, numeric vs numeric with Pearson correlation, and nominal vs numeric with ANOVA.
# Adopted from https://stackoverflow.com/a/52557631/590437
mixed_assoc = function(df, cor_method="spearman", adjust_cramersv_bias=TRUE){
    df_comb = expand.grid(names(df), names(df),  stringsAsFactors = F) %>% set_names("X1", "X2")

    is_nominal = function(x) class(x) %in% c("factor", "character")
    # https://community.rstudio.com/t/why-is-purr-is-numeric-deprecated/3559
    # https://github.com/r-lib/rlang/issues/781
    is_numeric <- function(x) { is.integer(x) || is_double(x)}

    f = function(xName,yName) {
        x =  pull(df, xName)
        y =  pull(df, yName)

        result = if(is_nominal(x) && is_nominal(y)){
            # use bias corrected cramersV as described in https://rdrr.io/cran/rcompanion/man/cramerV.html
            cv = cramerV(as.character(x), as.character(y), bias.correct = adjust_cramersv_bias)
            data.frame(xName, yName, assoc=cv, type="cramersV")

        }else if(is_numeric(x) && is_numeric(y)){
            correlation = cor(x, y, method=cor_method, use="complete.obs")
            data.frame(xName, yName, assoc=correlation, type="correlation")

        }else if(is_numeric(x) && is_nominal(y)){
            # from https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable/124618#124618
            r_squared = summary(lm(x ~ y))$r.squared
            data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")

        }else if(is_nominal(x) && is_numeric(y)){
            r_squared = summary(lm(y ~x))$r.squared
            data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")

        }else {
            warning(paste("unmatched column type combination: ", class(x), class(y)))
        }

        # finally add complete obs number and ratio to table
        result %>% mutate(complete_obs_pairs=sum(!is.na(x) & !is.na(y)), complete_obs_ratio=complete_obs_pairs/length(x)) %>% rename(x=xName, y=yName)
    }

    # apply function to each variable combination
    map2_df(df_comb$X1, df_comb$X2, f)
}
```
### Associations between features
```{r}
associations <- mixed_assoc(strokes)
associations
```
## bmi imputation
#### bmi has multiple missing values. Instead of using the mean or median of the entire data set to impute them, determine whether a second feature has an association with bmi

### association of bmi with other variables
```{r}
bmi_assoc <- associations %>% filter(y == "bmi")
```

### Plots of bmi vs other features   

```{r}
# ggplot(data=associations, aes(x=x, y=y, fill=assoc)) + geom_tile() + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1), axis.title = element_blank(), plot.title=element_text(hjust=0.5)) + ggtitle("Association Between Features")
ggplot(data=bmi_assoc, aes(x=x, y=y, fill=assoc)) + geom_tile() + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1, size=20), axis.text.y = element_text(size=20),axis.title = element_blank(), plot.title=element_text(hjust=0.5)) + ggtitle("Association with bmi")
plot(bmi ~ ., data=strokes)
```

#### Age has an association with bmi, so the data will be binned by age and the missing bmi values will be imputed using the median value of the bin they fall in.


### Create bins for age feature and set missing BMI values to median of the bin they fall in
```{r echo=TRUE}
strokes_original <- strokes
strokes$age_cat <- bin(strokes$age, nbins=10, labels=c('a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8','a9', 'a10'), method = "clusters")

strokes <- strokes %>% group_by(age_cat) %>% mutate(bmi.new = median(bmi, na.rm=TRUE))
strokes$bmi <- ifelse(is.na(strokes$bmi), strokes$bmi.new, strokes$bmi)

# remove working columns
strokes <- strokes[-c(12,13)]
```

### Verify missing values are resolved and data types are as expected
``` {r echo=FALSE, results='asis'}
missing <- sapply(strokes, function(x) sum(is.na(x)))
pandoc.header("Missing Values", 4)
pandoc.table(missing)
```

```{r}



str(strokes)
```

### Explore relation of features with response variable 'stroke'




```{r}
p1 <- ggplot(data=strokes, aes(stroke, fill=as.factor(gender))) + geom_bar(position='fill') + labs(fill="gender")
p2 <- ggplot(data=strokes, aes(stroke, age)) + geom_boxplot()
p3 <- ggplot(data=strokes, aes(stroke, fill=as.factor(hypertension))) + geom_bar(position='fill') + labs(fill="hypertension")
p4 <- ggplot(data=strokes, aes(stroke, fill=as.factor(heart_disease))) + geom_bar(position='fill') + labs(fill='heart_disease')
p5 <- ggplot(data=strokes, aes(stroke, fill=as.factor(ever_married))) + geom_bar(position='fill') + labs(fill="ever_married")
p6 <- ggplot(data=strokes, aes(stroke, fill=work_type)) + geom_bar(position='fill')
p7 <- ggplot(data=strokes, aes(stroke, fill=as.factor(Residence_type))) + geom_bar(position='fill') + labs(fill="Residence_type")
p8 <- ggplot(data=strokes, aes(stroke, avg_glucose_level)) + geom_boxplot()
p9 <- ggplot(data=strokes, aes(stroke, bmi)) + geom_boxplot()
p10 <- ggplot(data=strokes, aes(stroke, fill=smoking_status)) + geom_bar(position='fill')

grid.arrange(p1,p2,p3,p4,nrow=2)
grid.arrange(p5,p6,p7,p8,nrow=2)
grid.arrange(p9,p10,nrow=1)

```
```{r echo=FALSE, results='asis'}
age_t <- t.test(strokes$age~strokes$stroke, alternative='two.sided')
glucose_t <- t.test(strokes$avg_glucose_level~strokes$stroke, alternative='two.sided')
bmi_t <- t.test(strokes$bmi~strokes$stroke, alternative='two.sided')


smoking_chi2 <- strokes %>% select(smoking_status,stroke) %>% table() %>% chisq.test()
gender_chi2 <- strokes %>% select(gender,stroke) %>% table() %>% chisq.test()
hypertension_chi2 <- strokes %>% select(hypertension,stroke) %>% table() %>% chisq.test()
hd_chi2 <- strokes %>% select(heart_disease,stroke) %>% table() %>% chisq.test()
em_chi2 <- strokes %>% select(ever_married,stroke) %>% table() %>% chisq.test()
wt_chi2 <- strokes %>% select(work_type,stroke) %>% table() %>% chisq.test()
rt_chi2 <- strokes %>% select(Residence_type,stroke) %>% table() %>% chisq.test()

agep <- age_t$p.value
glucosp <- glucose_t$p.value
bmip <- bmi_t$p.value

smokingp <- smoking_chi2$p.value
genderp <- gender_chi2$p.value
hypertensionp <- hypertension_chi2$p.value
hdp <- hd_chi2$p.value
emp <- em_chi2$p.value
wtp <- wt_chi2$p.value
rtp <- rt_chi2$p.value

ttest_values <- c(agep, glucosp, bmip)
ttest_labels <- c("age", "glucose", "bmi")

chisq_values <- c(smokingp, genderp, hypertensionp, hdp, emp, wtp, rtp)
chisq_labels <- c("smoking_status", "gender", "hypertension", "heart_disease", "ever_married", "work_type", "residence_type")

tdf <- data.frame(ttest_labels, ttest_values)
cdf <- data.frame(chisq_labels, chisq_values)



```
```{r echo=FALSE, results='asis'}
pandoc.header("Chi-Square Tests", 3)
pandoc.table(cdf)

pandoc.header("T-Tests",3)
pandoc.table(tdf)
```



#### The features gender and residence type do not show a statistically significant association with the stroke outcome. This can be observed visually as well as by the Chi-Square tests.

### Review distribution of numeric features

```{r}
hist(strokes$bmi, main="bmi", xlab="bmi")
hist(strokes$avg_glucose_level, main="average glucose level", xlab="avg_glucose_level")
hist(strokes$age, main="age", xlab="age")


```

#### Both avg_glucose_level and bmi are right-skewed. To help correct this, both variables will be replaced with their respective log values
```{r}
strokes$bmi <- log(strokes$bmi)
strokes$avg_glucose_level <- log(strokes$avg_glucose_level)

hist(strokes$bmi)
hist(strokes$avg_glucose_level)
```
#### Since neither gender nor Residence_type shows a statistically significant association with the stroke outcome, both features will be removed to avoid unnecessary model complexity and noise
 
```{r}
strokes <- strokes[-c(1,7)]
```
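Dropping columns by position is brittle if the column order ever changes; a name-based equivalent (a sketch, not evaluated since the chunk above already removes them):

```{r eval=FALSE}
# remove the two non-significant features by name
strokes <- strokes %>% select(-gender, -Residence_type)
```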
 
 
 
 

## Prepare Data Sets for Model Usage
* Split into train, validation and test datasets   
* Create balanced training data sets   
* Separate into features and labels
* Scale Datasets  
* One-Hot Encode Datasets
```{r echo=TRUE, results = 'hide'}

# create train/test dataset as well as subset of train for cross-validation and evaluation usage
set.seed(123)
selection <- createDataPartition(strokes$stroke, p=0.8, list=FALSE)   
trainDataFull = strokes[selection,]    
testData = strokes[-selection,]
set.seed(123)
selection2 <- createDataPartition(trainDataFull$stroke, p=0.8, list=FALSE)
trainData = trainDataFull[selection2,]
valData = trainDataFull[-selection2,]

#create balanced training datasets

trainDataBalanced <- ROSE(stroke ~ ., data=trainData, p=0.5)$data
trainDataFullBalanced <- ROSE(stroke ~ ., data=trainDataFull, p=0.5)$data

# separate features and labels

valDataFeatures <- valData[-9]
valDataLabels <- valData[9]

testDataFeatures <- testData[-9]
testDataLabels <- testData[9]

trainDataFeatures <- trainData[-9]
trainDataLabels <- trainData[9]

trainDataFullFeatures <- trainDataFull[-9]
trainDataFullLabels <- trainDataFull[9]

trainDataBalancedFeatures <- trainDataBalanced[-9]
trainDataBalancedLabels <- trainDataBalanced[9]

trainDataFullBalancedFeatures <- trainDataFullBalanced[-9]
trainDataFullBalancedLabels <- trainDataFullBalanced[9]

# scale features data sets. testData scales with trainDataFull while valData scales with trainData

preProcScaleFullTrain <- preProcess(trainDataFullBalancedFeatures, method=c("center", "scale"))
preProcScaleTrain <- preProcess(trainDataBalancedFeatures, method=c("center", "scale"))

trainDataFullBalancedFeatures_s <- predict(preProcScaleFullTrain, trainDataFullBalancedFeatures)
testDataFeatures_s <- predict(preProcScaleFullTrain, testDataFeatures)

trainDataBalancedFeatures_s <- predict(preProcScaleTrain, trainDataBalancedFeatures)
valDataFeatures_s <- predict(preProcScaleTrain, valDataFeatures)

# one-hot encode  datasets
dummy1 <- dummyVars("~ .", data=trainDataFullBalancedFeatures_s)
trainDataFullBalancedFeatures_se <- data.frame(predict(dummy1,newdata=trainDataFullBalancedFeatures_s))

dummy2 <- dummyVars("~ .", data=trainDataBalancedFeatures)
trainDataBalancedFeatures_e <- data.frame(predict(dummy2,newdata=trainDataBalancedFeatures))

dummy3 <- dummyVars("~ .", data=trainDataFeatures)
trainDataFeatures_e <- data.frame(predict(dummy3,newdata=trainDataFeatures))

dummy4 <- dummyVars("~ .", data=trainDataFullFeatures)
trainDataFullFeatures_e <- data.frame(predict(dummy4,newdata=trainDataFullFeatures))


testDataFeatures_se <- data.frame(predict(dummy1,newdata=testDataFeatures_s))


valDataFeatures_e <- data.frame(predict(dummy2,newdata=valDataFeatures))

fullsize <- dim(trainDataFull)
trainsize <- dim(trainData)
valsize <- dim(valData)
testsize <- dim(testData)

distfull <- table(trainDataFull$stroke)
disttrain <- table(trainData$stroke)
distval <- table(valData$stroke)
disttest <- table(testData$stroke)


```
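
A quick sanity check that the centering and scaling behaved as expected; a minimal sketch:

```{r eval=FALSE}
# numeric columns of the scaled training features should have mean ~0 and sd ~1
num_cols <- sapply(trainDataBalancedFeatures_s, is.numeric)
round(colMeans(trainDataBalancedFeatures_s[num_cols]), 3)
round(sapply(trainDataBalancedFeatures_s[num_cols], sd), 3)
```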
### Verify dimensions and distribution of response variable for datasets   

```{r echo=FALSE, results='asis'}
pandoc.header("trainDataFull", 4)
pandoc.table(fullsize)

pandoc.header("trainData", 4)
pandoc.table(trainsize)

pandoc.header("valData", 4)
pandoc.table(valsize)

pandoc.header("testData",4)
pandoc.table(testsize)

pandoc.header("trainDataFull", 4)
pandoc.table(prop.table(distfull))

pandoc.header("trainData", 4)
pandoc.table(prop.table(disttrain))

pandoc.header("valData", 4)
pandoc.table(prop.table(distval))

pandoc.header("testData",4)
pandoc.table(prop.table(disttest))



```

#### The proportion of "No" to "Yes" values is essentially the same across all four datasets, confirming that the stratified partitioning preserved the class distribution.







# Fit Models
## Define control for models trained with caret
`myControl` applies ROSE sampling inside each resampling fold (so the synthetic balancing never touches the held-out fold), while `myControl2` leaves the class distribution unchanged. Both use repeated 10-fold cross-validation and the one-standard-error rule (`oneSE`) to select the simplest model whose ROC is within one standard error of the best.
```{r}
myControl <- trainControl(method='repeatedcv',
                          number=10,
                          repeats=5,
                          verboseIter = FALSE,
                          classProbs=TRUE,
                          summaryFunction=twoClassSummary,
                          selectionFunction="oneSE",
                          sampling='rose')

myControl2 <- trainControl(method='repeatedcv',
                          number=10,
                          repeats=5,
                          verboseIter = FALSE,
                          classProbs=TRUE,
                          summaryFunction=twoClassSummary,
                          selectionFunction="oneSE")
```



## KNN Model Training and Evaluation
The grid of k values is chosen to include the square root of the number of training observations, a common starting heuristic for KNN.
```{r warning=FALSE}
set.seed(123)

knn_grid <- expand.grid(k=seq(1,100,by=2))

knn_train <- cbind(trainDataFeatures_e, trainDataLabels)
knn_model <- caret::train(stroke ~ ., data=knn_train, method='knn', trControl=myControl, tuneGrid = knn_grid, preProc = 'range', metric='ROC')



```




```{r}
plot(knn_model)

knn_pred <- predict(knn_model, newdata=valDataFeatures_e)
knn_prob <- predict(knn_model, newdata=valDataFeatures_e, type='prob')
knn_results <- confusionMatrix(knn_pred, valDataLabels$stroke, positive = "Yes")

knn_results2 <- data.frame(as.factor(knn_pred), as.factor(valDataLabels$stroke), knn_prob)
colnames(knn_results2) <- c("pred", "obs", "No", "Yes")
knn_model
knn_results
knn_2class <- twoClassSummary(knn_results2, lev = c("No", "Yes"))
knn_2class[1]
knn_roc <- roc(valDataLabels$stroke ~ knn_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
```
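
The evaluation steps above (class predictions, probabilities, confusion matrix, `twoClassSummary`, ROC curve) repeat verbatim for every model that follows. A hypothetical helper, `evaluate_model` (not part of the original analysis), sketches how the duplication could be wrapped up:

```{r eval=FALSE}
# wrap the repeated evaluation pattern: predictions, probabilities,
# confusion matrix, two-class summary, and ROC curve
evaluate_model <- function(model, features, labels) {
  pred <- predict(model, newdata = features)
  prob <- predict(model, newdata = features, type = "prob")
  results <- data.frame(pred = pred, obs = labels, prob[, c("No", "Yes")])
  list(
    cm       = confusionMatrix(pred, labels, positive = "Yes"),
    twoClass = twoClassSummary(results, lev = c("No", "Yes")),
    roc      = roc(response = labels, predictor = prob$Yes, plot = TRUE,
                   print.auc = TRUE, legacy.axes = TRUE, main = "ROC Curve")
  )
}

# usage: knn_eval <- evaluate_model(knn_model, valDataFeatures_e, valDataLabels$stroke)
```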




## Logistic Regression Model with Elastic Net Regularization
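`glmnet` minimizes the penalized negative log-likelihood, where `alpha` mixes the ridge ($\alpha = 0$) and lasso ($\alpha = 1$) penalties and `lambda` sets their overall strength:

$$\min_{\beta_0,\,\beta}\; -\frac{1}{n}\sum_{i=1}^{n}\log L\big(y_i,\ \beta_0 + x_i^\top\beta\big) \;+\; \lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 + \alpha\lVert\beta\rVert_1\right]$$

The grid below crosses ten values of `alpha` with one hundred values of `lambda` spaced logarithmically between $10^{-3}$ and $10^{3}$.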
```{r}
set.seed(123)

glm_grid <- expand.grid(alpha=seq(0,1, length=10), lambda = 10^seq(-3,3, length=100))

glm_train <- cbind(trainDataFeatures_e, trainDataLabels)
glm_model <- caret::train(stroke ~ ., data=glm_train, method='glmnet', trControl=myControl, tuneGrid = glm_grid, preProc = "range", metric='ROC', family='binomial')
```

```{r}
glm_pred <- predict(glm_model, newdata = valDataFeatures_e)
glm_prob <- predict(glm_model, newdata = valDataFeatures_e, type='prob')
glm_results <- confusionMatrix(glm_pred, valDataLabels$stroke, positive="Yes")
glm_results

glm_results2 <- data.frame(as.factor(glm_pred), as.factor(valDataLabels$stroke), glm_prob)
colnames(glm_results2) <- c("pred", "obs", "No", "Yes")

glm_2class <- twoClassSummary(glm_results2, lev=c("No", "Yes"))
glm_2class[1]
glm_roc <- roc(valDataLabels$stroke ~ glm_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
coef(glm_model$finalModel, glm_model$bestTune$lambda)
varImp(glm_model)
```



## Decision Tree Model

```{r}
set.seed(123)

# cp (complexity parameter): a split must improve the overall fit by at
# least cp to be retained, so larger values give smaller trees
tree_grid <- expand.grid(cp=seq(0.0,1.0,by=.001))

tree_data <- cbind(trainDataFeatures, trainDataLabels)
tree_model <- caret::train(stroke ~.,data=tree_data, method='rpart', trControl=myControl, preProc = "range", metric='ROC', tuneGrid=tree_grid)
```

```{r}
tree_pred <- predict(tree_model, valDataFeatures)
tree_prob <- predict(tree_model, newdata = valDataFeatures, type='prob')
tree_results <- confusionMatrix(tree_pred, valDataLabels$stroke, positive="Yes")
tree_results

tree_results2 <- data.frame(as.factor(tree_pred), as.factor(valDataLabels$stroke), tree_prob)
colnames(tree_results2) <- c("pred", "obs", "No", "Yes")

tree_2class <- twoClassSummary(tree_results2, lev=c("No", "Yes"))
tree_2class[1]
tree_roc <- roc(valDataLabels$stroke ~ tree_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
fancyRpartPlot(tree_model$finalModel)
```
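
The pruning behaviour can be inspected directly on the fitted `rpart` object; a minimal sketch:

```{r eval=FALSE}
# cross-validated error at each candidate cp value for the selected tree
printcp(tree_model$finalModel)
plotcp(tree_model$finalModel)
```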
Given the decision tree's poor performance, a second tree will be fit without balancing the training data (using `myControl2`, which omits the ROSE sampling).


```{r}
set.seed(123)

# reuse the same grid and data; only the control changes (no ROSE sampling)
tree_grid <- expand.grid(cp=seq(0.0,1.0,by=.001))

tree_data <- cbind(trainDataFeatures, trainDataLabels)
tree_model <- caret::train(stroke ~.,data=tree_data, method='rpart', trControl=myControl2, preProc = "range", metric='ROC', tuneGrid=tree_grid)
```

```{r}
# note: these objects overwrite the first tree's results, so the model
# comparison section below reflects this second (unbalanced) tree
tree_pred <- predict(tree_model, valDataFeatures)
tree_prob <- predict(tree_model, newdata = valDataFeatures, type='prob')
tree_results <- confusionMatrix(tree_pred, valDataLabels$stroke, positive="Yes")
tree_results

tree_results2 <- data.frame(as.factor(tree_pred), as.factor(valDataLabels$stroke), tree_prob)
colnames(tree_results2) <- c("pred", "obs", "No", "Yes")

tree_2class <- twoClassSummary(tree_results2, lev=c("No", "Yes"))
tree_2class[1]
tree_roc <- roc(valDataLabels$stroke ~ tree_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)
fancyRpartPlot(tree_model$finalModel)
```

## Random Forest Model
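
`mtry` is the number of predictors randomly sampled as split candidates at each node. The grid below brackets the usual classification default of roughly the square root of the number of predictors; a quick check:

```{r eval=FALSE}
# randomForest's default mtry for classification is floor(sqrt(p))
floor(sqrt(ncol(trainDataFeatures)))
```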

```{r}

rf_grid <- expand.grid(mtry=c(1,2,4,6,8))
set.seed(123)

rf_data <- cbind(trainDataFeatures, trainDataLabels)
rf_model <- caret::train(stroke ~.,data=rf_data, method='rf', trControl=myControl, preProc = "range", metric='ROC', tuneGrid=rf_grid)

rf_pred <- predict(rf_model, valDataFeatures)
rf_prob <- predict(rf_model, valDataFeatures, type='prob')
```

```{r}
rf_results <- confusionMatrix(rf_pred,valDataLabels$stroke, positive="Yes")
rf_results
rf_results2 <- data.frame(as.factor(rf_pred), as.factor(valDataLabels$stroke), rf_prob)
colnames(rf_results2) <- c("pred", "obs", "No", "Yes")


rf_2class <- twoClassSummary(rf_results2, lev=c("No", "Yes"))
rf_2class[1]
rf_roc <- roc(valDataLabels$stroke ~ rf_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve")
varImp(rf_model, scale = TRUE)
rf_model
plot(rf_model)
```






## Linear SVM Model
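The cost parameter `C` sets the soft-margin trade-off. The linear SVM solves

$$\min_{w,\,b,\,\xi}\;\frac{1}{2}\lVert w\rVert^2 + C\sum_{i=1}^{n}\xi_i \quad\text{s.t.}\quad y_i\,(w^\top x_i + b) \ge 1-\xi_i,\quad \xi_i \ge 0,$$

so larger `C` penalizes margin violations more heavily. The grid of powers of 3 below sweeps `C` across several orders of magnitude.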
```{r echo=TRUE}
set.seed(123)
lsvm_grid <- expand.grid(C = 3**(-7:7))
lsvm_data <- cbind(trainDataFeatures_e, trainDataLabels)
lsvm_model <- caret::train(stroke ~., data = lsvm_data, method = "svmLinear", trControl = myControl, preProcess = 'range', tuneGrid = lsvm_grid, metric= 'ROC')
```

```{r}
lsvm_pred <- predict(lsvm_model, valDataFeatures_e)
lsvm_prob <- predict(lsvm_model, valDataFeatures_e, type='prob')
lsvm_results <- confusionMatrix(lsvm_pred, valDataLabels$stroke, positive="Yes")
lsvm_results2 <- data.frame(as.factor(lsvm_pred), as.factor(valDataLabels$stroke), lsvm_prob)
lsvm_results

colnames(lsvm_results2) <- c("pred", "obs", "No", "Yes")
lsvm_2class <- twoClassSummary(lsvm_results2, lev=c("No", "Yes"))
lsvm_2class[1]
varImp(lsvm_model, scale = TRUE)
lsvm_roc <- roc(valDataLabels$stroke ~ lsvm_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)

plot(lsvm_model, scales=list(x=list(log=3)))
lsvm_model

```

## Gradient Boosting Model
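Gradient boosting builds the model as a sum of `n.trees` shallow trees, each fit to the gradient of the loss and damped by the `shrinkage` rate $\nu$:

$$F_m(x) = F_{m-1}(x) + \nu\, h_m(x), \qquad m = 1, \dots, M$$

Smaller shrinkage values generally need more trees to compensate, and `interaction.depth` caps the size of each tree $h_m$.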

```{r message=FALSE, warning=FALSE, include=TRUE}
# note: this grid is defined but never passed to train(); the call below uses
# tuneLength = 10, so caret generates its own tuning grid instead
gbm_Grid <- expand.grid(interaction.depth = c(1, 3, 6, 9, 10),
                        n.trees = seq(250, 2000, 250),
                        shrinkage = seq(.0005, .05, .0005),
                        n.minobsinnode = 10)

set.seed(123)
gbm_data <- cbind(trainDataFeatures_e, trainDataLabels)
gbm_model <- caret::train(stroke ~., data = gbm_data, method = "gbm", trControl = myControl, preProcess = 'range', metric= 'ROC', tuneLength = 10)
```

```{r}
gbm_pred <- predict(gbm_model, valDataFeatures_e)
gbm_prob <- predict(gbm_model, valDataFeatures_e, type='prob')

gbm_results <- confusionMatrix(gbm_pred, valDataLabels$stroke, positive='Yes')
gbm_results

gbm_results2 <- data.frame(as.factor(gbm_pred), as.factor(valDataLabels$stroke), gbm_prob)
colnames(gbm_results2) <- c("pred", "obs", "No", "Yes")

gbm_2class <- twoClassSummary(gbm_results2, lev=c("No", "Yes"))
gbm_2class[1]
gbm_roc <- roc(valDataLabels$stroke ~ gbm_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)

gbm_model
plot(gbm_model)
```



## Neural Network Model
### Scale datasets for usage
```{r}

# numeric features: age (1), avg_glucose_level (6), bmi (7)
numerical <- c(1,6,7)

trainDataFullNum <- trainDataFull[numerical]
trainDataFullCat <- trainDataFull[-numerical]

trainDataNum <- trainData[numerical]
trainDataCat <- trainData[-numerical]

valDataNum <- valData[numerical]
valDataCat <- valData[-numerical]

testDataNum <- testData[numerical]
testDataCat <- testData[-numerical]

# scale numeric features; test/validation sets reuse the training means and
# standard deviations so no information leaks from the held-out data
trainDataFullScaled <- scale(trainDataFullNum)
col_means_train <- attr(trainDataFullScaled, "scaled:center")
col_stddevs_train <- attr(trainDataFullScaled, "scaled:scale")

testDataScaled <- scale(testDataNum, center = col_means_train, scale = col_stddevs_train)

trainDataScaled <- scale(trainDataNum)
col_means_train <- attr(trainDataScaled, "scaled:center")
col_stddevs_train <- attr(trainDataScaled, "scaled:scale")

valDataScaled <- scale(valDataNum, center = col_means_train, scale = col_stddevs_train)

nnFullTrainData <- cbind(trainDataFullScaled, trainDataFullCat)
nnTrainData <- cbind(trainDataScaled, trainDataCat)
nnTestData <- cbind(testDataScaled, testDataCat)
nnValData <- cbind(valDataScaled, valDataCat)

# balance the training data (plus a balanced copy of validation, retained for reference)
nnTrainData <- ROSE(stroke ~ ., data=nnTrainData, p=0.5)$data
nnFullTrainData <- ROSE(stroke ~ ., data=nnFullTrainData, p=0.5)$data
nnValBal <- ROSE(stroke ~ ., data=nnValData, p=0.5)$data
```

### Separate features from labels

```{r}
nnFullTrainFeatures <- nnFullTrainData[-9]
nnFullTrainLabels <- nnFullTrainData[9]

nnTrainFeatures <- nnTrainData[-9]
nnTrainLabels <- nnTrainData[9]

nnTestFeatures <- nnTestData[-9]
nnTestLabels <- nnTestData[9]

nnValFeatures <- nnValData[-9]
nnValLabels <- nnValData[9]

nnValBalFeatures <- nnValBal[-9]
nnValBalLabels <- nnValBal[9]

# encode the factor labels as 0/1 for keras
nnValLabels$stroke <- ifelse(nnValLabels$stroke == 'No', 0, 1)
nnTestLabels$stroke <- ifelse(nnTestLabels$stroke == 'No', 0, 1)
nnTrainLabels$stroke <- ifelse(nnTrainLabels$stroke == 'No', 0, 1)
nnFullTrainLabels$stroke <- ifelse(nnFullTrainLabels$stroke == 'No', 0, 1)
nnValBalLabels$stroke <- ifelse(nnValBalLabels$stroke == 'No', 0, 1)


```

```{r echo=FALSE}
pandoc.header("Stroke Distrubution for Train Dataset",3)
pandoc.table(table(nnTrainLabels$stroke))

pandoc.header("Stroke Distrubution for fullTrain Dataset",3)
pandoc.table(table(nnFullTrainLabels$stroke))
```


### One-hot encode categorical features

```{r}
# fit the one-hot encoders on the training features, then apply them to the
# corresponding evaluation sets
dummyFullTrain <- dummyVars("~ .", data=nnFullTrainFeatures)
dummyTrain <- dummyVars("~ .", data=nnTrainFeatures)

nnFullTrainFeatures <- data.frame(predict(dummyFullTrain,newdata=nnFullTrainFeatures))
nnTestFeatures <- data.frame(predict(dummyFullTrain,newdata=nnTestFeatures))

nnTrainFeatures <- data.frame(predict(dummyTrain,newdata=nnTrainFeatures))
nnValFeatures <- data.frame(predict(dummyTrain,newdata=nnValFeatures))
nnValBalFeatures <- data.frame(predict(dummyTrain,newdata=nnValBalFeatures))
```
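
Because each encoder is fit on training features and then applied to held-out features, a quick check that the resulting column layouts match can catch silent mismatches before the matrices reach keras; a minimal sketch:

```{r eval=FALSE}
# keras expects identical column order between training and evaluation matrices
stopifnot(identical(colnames(nnTrainFeatures), colnames(nnValFeatures)))
stopifnot(identical(colnames(nnFullTrainFeatures), colnames(nnTestFeatures)))
```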






### Neural Network Model Tuning
```{r echo=TRUE, message=FALSE, results='hide'}
 
clean_runs(confirm=FALSE)
set.seed(123)
runs <- tuning_run("ann.R",
                   flags = list(nodes1 = c(20, 50, 75),
                                nodes2 = c(20, 50, 75),
                                drop1 = c(0.2, 0.4, 0.6),
                                drop2 = c(0.2, 0.4, 0.6),
                                learning_rate = c(0.01, 0.05, 0.001, 0.0001),
                                epochs = c(25, 50, 100),
                                batch_size = c(10, 25, 50),
                                activation = c("relu", "sigmoid", "tanh")),
                   sample = 0.02, confirm = FALSE, echo = FALSE)
```
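
The flag grid defines $3 \times 3 \times 3 \times 3 \times 4 \times 3 \times 3 \times 3 = 8{,}748$ combinations, so `sample = 0.02` trains a random subset of roughly 175 runs rather than the full grid. A quick check of the arithmetic:

```{r eval=FALSE}
# number of flag combinations and the expected number of sampled runs
n_combos <- 3 * 3 * 3 * 3 * 4 * 3 * 3 * 3
n_combos                # 8748
round(0.02 * n_combos)  # ~175
```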

```{r}
# rank the tuning runs by validation accuracy and keep the best one
rundf <- ls_runs(order='metric_val_accuracy', decreasing=TRUE)
rundf

bestrun <- rundf[1,1:19]
```

```{r echo=FALSE, results='asis'}
pandoc.header("Best Run Details",3)
pandoc.table(t(bestrun))
```

### Create model using parameters from the best tuning run



```{r include=TRUE, results='hide'}
# architecture and hyperparameters taken from the best tuning run above
model <- keras_model_sequential()
model %>%
  layer_dense(units = 50, activation = "relu", input_shape = dim(nnTrainFeatures)[2]) %>%
  layer_dropout(rate=0.4) %>%
  layer_dense(units = 50, activation = "relu") %>%
  layer_dropout(rate=0.6) %>%
  layer_dense(units = 1, activation = 'sigmoid')

model %>% compile(
  optimizer = optimizer_adam(lr=0.05),
  loss='binary_crossentropy',
  metrics = c('accuracy')
)

set.seed(234)
history <- model %>% fit(as.matrix(nnTrainFeatures), as.matrix(nnTrainLabels), epochs=50, batch_size=10, validation_data=list(as.matrix(nnValFeatures), as.matrix(nnValLabels)))
```
### Evaluate NN Model Performance
```{r}
set.seed(123)
model %>% evaluate(as.matrix(nnValFeatures), as.matrix(nnValLabels))

nn_pred <- model %>% predict_classes(as.matrix(nnValFeatures))
nn_prob <- model %>% predict_proba(as.matrix(nnValFeatures))
nn_prob <- data.frame(nn_prob)
colnames(nn_prob) <- c('Yes')
nn_prob$No <- 1 - nn_prob$Yes
nn_results2 <- data.frame(nn_pred, nnValLabels, nn_prob)
colnames(nn_results2) <- c("pred", "obs", "Yes", "No")

# convert the 0/1 predictions and labels back to factors for caret's summary functions
nn_results2$pred <- as.factor(ifelse(nn_results2$pred == 0, 'No', 'Yes'))
nn_results2$obs <- as.factor(ifelse(nn_results2$obs == 0, 'No', 'Yes'))
nn_2class <- twoClassSummary(nn_results2, lev=c("No", "Yes"))
confusionMatrix(nn_results2$pred,nn_results2$obs, positive='Yes')
nn_2class[1]
plot(history)
nn_roc <- roc(nnValLabels$stroke ~ nn_prob$Yes, plot=TRUE, print.auc=TRUE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=FALSE)

```



# Model Comparison
```{r echo=FALSE, results='asis'}
compared <- data.frame(knn_2class, glm_2class, tree_2class, rf_2class, lsvm_2class, gbm_2class, nn_2class)
colnames(compared) <- c("knn", "glmnet", "rpart", "rf", "svmLinear", "gbm", "neural network")
compared <- t(compared)
# twoClassSummary was called with lev = c("No", "Yes"), which treats "No" as the
# event of interest, so its Sens/Spec are swapped relative to the "Yes"-positive
# convention used by confusionMatrix; the labels below account for that
colnames(compared) <- c("ROC", "Spec", "Sens")
pandoc.header("Model Performance Comparison",3)
pandoc.table(compared)
``` 

```{r results='hide'}
# overlay the ROC curves for every model on a single plot; the AUC values are
# computed with ROCR but suppressed by results='hide'
pred_rf= prediction(rf_prob$Yes,valDataLabels$stroke)

performance(pred_rf, measure = "auc")@y.values
rf_roc <- roc(valDataLabels$stroke ~ rf_prob$Yes, plot=TRUE, print.auc=FALSE, col='green', lwd=4, legacy.axes=TRUE, main="ROC Curve")

pred_knn = prediction(knn_prob$Yes, valDataLabels$stroke)
performance(pred_knn, measure='auc')@y.values
knn_roc <- roc(valDataLabels$stroke ~ knn_prob$Yes, plot=TRUE, print.auc=FALSE, col='blue', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)

pred_glm <- prediction(glm_prob$Yes, valDataLabels$stroke)
glm_perform <- performance(pred_glm, measure='auc')@y.values
glm_roc <- roc(valDataLabels$stroke ~ glm_prob$Yes, plot=TRUE, print.auc=FALSE, col='orange', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)

pred_tree <- prediction(tree_prob$Yes, valDataLabels$stroke)
performance(pred_tree, measure='auc')@y.values
tree_roc <- roc(valDataLabels$stroke ~ tree_prob$Yes, plot=TRUE, print.auc=FALSE, col='yellow', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)


pred_lsvm <- prediction(lsvm_prob$Yes, valDataLabels$stroke)
performance(pred_lsvm, measure='auc')@y.values
lsvm_roc <- roc(valDataLabels$stroke ~ lsvm_prob$Yes, plot=TRUE, print.auc=FALSE, col='purple', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)

pred_gbm <- prediction(gbm_prob$Yes, valDataLabels$stroke)
performance(pred_gbm, measure='auc')@y.values
gbm_roc <- roc(valDataLabels$stroke ~ gbm_prob$Yes, plot=TRUE, print.auc=FALSE, col='black', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)

pred_nn = prediction(nn_prob$Yes,nnValLabels$stroke)
performance(pred_nn, measure='auc')@y.values
nn_roc <- roc(nnValLabels$stroke ~ nn_prob$Yes, plot=TRUE, print.auc=FALSE, col='brown', lwd=4, legacy.axes=TRUE, main="ROC Curve", add=TRUE)

legend("bottomright", legend=c("knn", 'rf', 'glmnet', 'rpart', 'svmLinear', 'gbm', 'neural network'), col=c("blue", "green", 'orange', 'yellow', 'purple', 'black', 'brown'), lwd=2)

```




# Build Final Model
### Based on the train/validation runs, the elastic net model (ROC: 0.8493, specificity: 0.7079, sensitivity: 0.8500) showed the best performance. A glmnet model is now trained on the full training data set and evaluated on the held-out test set.
```{r}
# combined features and labels (kept for reference; glmnet is called directly
# on the matrices below)
fullData <- cbind(trainDataFullBalancedFeatures_se, trainDataFullBalancedLabels)

# refit glmnet on the full balanced training data using the alpha/lambda
# selected during cross-validation
final_model <- glmnet(x=as.matrix(trainDataFullBalancedFeatures_se), y=as.matrix(trainDataFullBalancedLabels), family='binomial', alpha=glm_model$bestTune$alpha, lambda=glm_model$bestTune$lambda)
final_prediction <- predict(final_model, as.matrix(testDataFeatures_se),type='class')
final_probs <- predict(final_model, as.matrix(testDataFeatures_se),type='response')
final_prediction <- as.factor(final_prediction)
final_results <- data.frame(final_prediction, testDataLabels, final_probs)
colnames(final_results) <- c("pred", "obs", "Yes")
final_results$No <- 1 - final_results$Yes

pred_final= prediction(final_results$Yes,testDataLabels$stroke)

performance(pred_final, measure = "auc")@y.values
final_roc <- roc(testDataLabels$stroke ~ final_results$Yes, plot=TRUE, 
                 print.auc=TRUE, col='red', lwd=4, legacy.axes=TRUE, main="ROC Curve")
confusionMatrix(final_prediction, testDataLabels$stroke, positive="Yes")
coef(final_model)

```

### Final test-set metrics for the glmnet model are consistent with those observed during cross-validation and tuning.
